PART 4: Cancer subtyping and prognosis analysis

Shixiang Wang, Ziyu Tao, Tao Wu, Xue-Song Liu (Corresponding author)

2021-08-18

In this part, we would like to cluster all tumors based on CN signatures and use the clusters to explore Pan-Cancer heterogeneity.

Data integration

Many raw data have been processed by tools (or collected) and cleaned. This part I mainly describe how to combine them to construct the integrated dataset.

If you care about the data (source) shown below, please check the last section of PART 0.

library(pacman)
p_load(
  tidyverse,
  data.table,
  sigminer,
  IDConverter
)

Load pre-processed data.

pcawg_cn_obj <- readRDS("../data/pcawg_cn_obj.rds")
pcawg_cn_sig <- readRDS("../data/pcawg_cn_sigs_CN176_signature.rds")
pcawg_samp_info_sp <- readRDS("../data/pcawg_samp_info_sp.rds")
samp_summary <- pcawg_cn_obj@summary.per.sample
samp_summary$n_loh <- NULL

Check if all samples are recorded.

all(pcawg_cn_obj@summary.per.sample$sample %in% pcawg_samp_info_sp$pcawg_specimen_histology_August2016_v9$icgc_specimen_id)
pcawg_samp_info_sp <- lapply(pcawg_samp_info_sp, function(x) {
  colnames(x)[1] <- "sample"
  x
})
pcawg_samp_info <- purrr::reduce(pcawg_samp_info_sp, dplyr::full_join, by = "sample")
pcawg_samp_info <- pcawg_samp_info %>% dplyr::filter(sample %in% pcawg_cn_obj@summary.per.sample$sample)

rm(pcawg_samp_info_sp)

Refer to PCAWG Mutational Signatures Working Group et al. (2020), some cancer types with small sample size should be combined.

table(pcawg_samp_info$histology_abbreviation)
pcawg_samp_info <- pcawg_samp_info %>%
  mutate(
    cancer_type = case_when(
      startsWith(histology_abbreviation, "Breast") ~ "Breast",
      startsWith(histology_abbreviation, "Biliary-AdenoCA") ~ "Biliary-AdenoCA",
      histology_abbreviation %in% c("Bone-Epith", "Bone-Benign") ~ "Bone-Other",
      startsWith(histology_abbreviation, "Cervix") ~ "Cervix",
      histology_abbreviation %in% c("Myeloid-AML, Myeloid-MDS", "Myeloid-AML, Myeloid-MPN", "Myeloid-MDS", "Myeloid-MPN") ~ "Myeloid-MDS/MPN",
      TRUE ~ histology_abbreviation
    )
  )

Select important features.

pcawg_samp_info <- pcawg_samp_info %>%
  dplyr::select(c(
    "sample", "_PATIENT", "donor_sex", "donor_age_at_diagnosis",
    "donor_survival_time", "donor_vital_status", "first_therapy_type",
    "tobacco_smoking_history_indicator", "alcohol_history",
    "dcc_project_code", "dcc_specimen_type",
    "histology_abbreviation", "cancer_type",
    "tumour_stage",
    "level_of_cellularity"
  ))
colnames(pcawg_samp_info)[2] <- "donor_id"

# Remove duplicated samples
pcawg_samp_info <- pcawg_samp_info[!duplicated(pcawg_samp_info$sample), ]

Get signature activities.

pcawg_activity <- readRDS("../data/pcawg_cn_sigs_CN176_activity.rds")

expo_abs <- pcawg_activity$absolute[, lapply(.SD, function(x) if (is.numeric(x)) round(x, digits = 3) else x)]
colnames(expo_abs) <- c("sample", paste0("Abs_", colnames(expo_abs)[-1]))


expo_rel <- pcawg_activity$relative[, lapply(.SD, function(x) if (is.numeric(x)) round(x, digits = 3) else x)]
colnames(expo_rel) <- c("sample", paste0("Rel_", colnames(expo_rel)[-1]))

pcawg_activity2 <- dplyr::left_join(expo_abs, expo_rel, by = "sample")
pcawg_activity2 <- dplyr::left_join(pcawg_activity2, pcawg_activity$similarity, by = "sample")

pcawg_activity2$keep <- pcawg_activity2$similarity > 0.75
pcawg_activity <- pcawg_activity2

rm(pcawg_activity2, expo_abs, expo_rel)

Read tumor purity & ploidy & WGD status.

dt_pp <- fread("../data/PCAWG/consensus.20170217.purity.ploidy_sp")
dt_pp <- dt_pp[, c("samplename", "purity", "ploidy", "wgd_status")]
colnames(dt_pp)[1] <- "sample"

Calculate fusion events in all samples.

fusion <- fread("../data/PCAWG/pcawg3_fusions_PKU_EBI.gene_centric.sp.xena")
fusion <- as.matrix(fusion[, -1])
fusion <- apply(fusion, 2, sum)
fusion <- dplyr::tibble(
  sample = names(fusion),
  n_fusion = as.numeric(fusion)
)

Load HRD status in all samples.

HRD <- readRDS("../data/PCAWG/pcawg_CHORD.rds")
HRD <- HRD %>%
  dplyr::select(icgc_specimen_id, is_hrd, genotype_monoall) %>%
  set_names(c("sample", "hrd_status", "hrd_genotype")) %>%
  unique()

Read LOH data.

LOH <- readRDS("../data/pcawg_loh.rds")

Read Chromothripsis data and clean it.

# http://compbio.med.harvard.edu/chromothripsis/
chromothripsis <- fread("../data/PCAWG/PCAWG-ShatterSeek.txt.gz")
chromothripsis <- chromothripsis[, c("icgc_donor_id", "type_chromothripsis", "chromo_label")]

# Check label
table(chromothripsis$chromo_label)
any(is.na(chromothripsis$chromo_label))

# Summarize
chromothripsis_summary <- chromothripsis[
  , .(
    n_chromo = sum(chromo_label != "No"),
    chromo_type = paste0(na.omit(type_chromothripsis), collapse = "/")
  ),
  by = icgc_donor_id
]

chromothripsis_summary <- merge(chromothripsis_summary, pcawg_full[, c("icgc_donor_id", "icgc_specimen_id")],
  by = "icgc_donor_id"
) %>% unique()
chromothripsis_summary$icgc_donor_id <- NULL
colnames(chromothripsis_summary)[3] <- "sample"
rm(chromothripsis)

Read AmpliconArchitect results.

aa1 <- readxl::read_excel("../data/PCAWG/ecDNA.xlsx", skip = 1)
aa2 <- readxl::read_excel("../data/PCAWG/ecDNA.xlsx", sheet = 3)

table(aa1$amplicon_classification)
table(aa2$sample_classification)
all(aa1$sample_barcode %in% aa2$sample_barcode)

aa_summary <- aa1 %>%
  dplyr::group_by(sample_barcode) %>%
  dplyr::summarise(
    n_amplicon_BFB = sum(amplicon_classification == "BFB", na.rm = TRUE),
    n_amplicon_ecDNA = sum(amplicon_classification == "Circular", na.rm = TRUE),
    n_amplicon_HR = sum(amplicon_classification == "Heavily-rearranged", na.rm = TRUE),
    n_amplicon_Linear = sum(amplicon_classification == "Linear", na.rm = TRUE),
    .groups = "drop"
  )

data.table::setDT(aa_summary)

custom_dt <- IDConverter::pcawg_simple[, c("submitted_specimen_id", "icgc_specimen_id")]
custom_dt <- custom_dt[startsWith(submitted_specimen_id, "TCGA")]
custom_dt[, submitted_specimen_id := substr(submitted_specimen_id, 1, 15)]

aa_summary[
  , sample := ifelse(
    startsWith(sample_barcode, "SA"),
    convert_pcawg(sample_barcode, from = "icgc_sample_id", to = "icgc_specimen_id"),
    convert_custom(sample_barcode, from = "submitted_specimen_id", to = "icgc_specimen_id", dt = custom_dt)
  )
]

## Filter out samples not in PCAWG
aa_summary <- aa_summary[!is.na(sample)][, sample_barcode := NULL]

rm(aa1, aa2, custom_dt)

Read TelomereHunter results.

TC <- readxl::read_excel("../data/PCAWG/TelomereContent.xlsx")
TC <- TC %>% dplyr::select(c("icgc_specimen_id", "tel_content_log2"))
colnames(TC)[1] <- "sample"
data.table::setDT(TC)
apobec <- read_tsv("../data/PCAWG/MAF_Aug31_2016_sorted_A3A_A3B_comparePlus.txt", col_types = cols())
apobec <- apobec %>%
  dplyr::select(c(1, 5)) %>%
  data.table::as.data.table() %>%
  setNames(c("sample", "APOBEC_mutations"))

Combine and save result combined data.

pcawg_samp_info <- purrr::reduce(list(
  pcawg_samp_info,
  pcawg_activity,
  samp_summary,
  dt_pp,
  fusion,
  HRD,
  LOH,
  chromothripsis_summary,
  aa_summary,
  TC,
  apobec
),
dplyr::left_join,
by = "sample"
)

saveRDS(pcawg_samp_info, file = "../data/pcawg_sample_tidy_info.rds")

## Remove all objects
rm(list = ls())

Clustering with signature activity

Here, we use recent consensus clustering toolkit cola.

We take 2 steps to obtain a robust clustering result:

  1. We sort all samples by their total signature activities and randomly select 500 samples for running multiple methods provided by cola at the same time, then pick up the optimal method combination.
  2. We run clustering for all samples by the method combination above and check the cola report to determine the suitable cluster number.

Step 1: select suitable method combination

There are 2 key arguments in cola clustering function:

  1. top_value_method: used to extract rows (i.e. signatures here) with top values.
  2. partition_method: used to select partition method.

We try all combinations with randomly selected 500 samples to explore the suitable setting.

library(cola)
library(tidyverse)

act <- readRDS("../data/pcawg_cn_sigs_CN176_activity.rds")

df <- purrr::reduce(
  list(
    act$absolute,
    act$similarity
  ),
  dplyr::left_join,
  by = "sample"
)

mat <- df %>%
  filter(similarity > 0.75) %>%
  select(sample, starts_with("CNS")) %>%
  column_to_rownames("sample") %>%
  t()
mat_adj <- adjust_matrix(mat)
# 1 rows have been removed with too low variance (sd < 0.05 quantile)

rownames(mat_adj)

# Select suitable parameters ----------------------------------------------

ds <- colSums(mat_adj)
boxplot(ds)

set.seed(123)
select_samps <- sample(names(sort(ds)), 500)

boxplot(ds[select_samps])

rl_samp <- run_all_consensus_partition_methods(mat_adj[, select_samps], top_n = 13, mc.cores = 8, max_k = 10)
cola_report(rl_samp, output_dir = "../output/cola_report/pcawg_sigs_500_sampls", mc.cores = 8)

rm(rl_samp)

The cola output report is very big and isn’t suitable to show, here I only include key figures to determine the parameter setting.

# Cluster number 2
knitr::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-1-1.png")

# Cluster number 3
knitr::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-2-1.png")

# Cluster number 4
knitr::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-3-1.png")

# Cluster number 5
knitr::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-4-1.png")

# Cluster number 6
knitr::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-5-1.png")

# Cluster number 7
knitr::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-6-1.png")

# Cluster number 8
knitr::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-7-1.png")

# Cluster number 9
knitr::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-8-1.png")

# Cluster number 10
knitr::include_graphics("cola-test-figs/tab-collect-stats-from-consensus-partition-list-9-1.png")

Step 2: select suitable cluster number

From figures above we do clearly see that the skmeans partition method fits our data and any top_value_method is okay (because all signatures are used). Finally, we choose combine skmeans (partition method) and ATC (top value method introduced firstly by cola) to run clustering for all samples.

final <- run_all_consensus_partition_methods(
  mat_adj,
  top_value_method = "ATC",
  partition_method = "skmeans",
  top_n = 13, mc.cores = 8, max_k = 10
)
cola_report(final, output_dir = "../output/cola_report/pcawg_sigs_all_sampls", mc.cores = 8)

saveRDS(final, file = "../data/pcawg_cola_result.rds")

The report can be viewed at here.

Next, we will load the data and visualize to select suitable cluster number.

library(cola)
library(ComplexHeatmap)
library(tidyverse)

cluster_res <- readRDS("../data/pcawg_cola_result.rds")
cluster_res <- cluster_res["ATC", "skmeans"]
cluster_res
A 'ConsensusPartition' object with k = 2, 3, 4, 5, 6, 7, 8, 9, 10.
  On a matrix with 13 rows and 2737 columns.
  Top rows (13) are extracted by 'ATC' method.
  Subgroups are detected by 'skmeans' method.
  Performed in total 450 partitions by row resampling.
  Best k for subgroups seems to be 2.

Following methods can be applied to this 'ConsensusPartition' object:
 [1] "cola_report"             "collect_classes"        
 [3] "collect_plots"           "collect_stats"          
 [5] "colnames"                "compare_signatures"     
 [7] "consensus_heatmap"       "dimension_reduction"    
 [9] "functional_enrichment"   "get_anno_col"           
[11] "get_anno"                "get_classes"            
[13] "get_consensus"           "get_matrix"             
[15] "get_membership"          "get_param"              
[17] "get_signatures"          "get_stats"              
[19] "is_best_k"               "is_stable_k"            
[21] "membership_heatmap"      "ncol"                   
[23] "nrow"                    "plot_ecdf"              
[25] "rownames"                "select_partition_number"
[27] "show"                    "suggest_best_k"         
[29] "test_to_known_factors"  

Show different statistics along with different cluster number k, which helps to determine the “best k.”

select_partition_number(cluster_res)

There are many details about all statistical measures shown above at cola vignette. The best k 2 is suggested by cola package due to its better statistical measures. However, in this practice, I think 5 is more suitable.

I have the following reasons:

  1. Measure 1 - PAC reaches convergence at k = 5.
  2. Measure Silhouette reaches local optimum at k = 5.
  3. Measure Concordance reaches local optimum at k = 5.

I will draw another plot to show why the two ks are suitable.

collect_classes(cluster_res)

We can see that the probability constitution of class assignment for subgroups for k = 6, 7, 8 are similar to k = 5.

Draw all plots into one single page.

pdf(file = "../output/cola_all_plots.pdf", width = 16, height = 10)
collect_plots(cluster_res)
dev.off()

Now we get the cluster labels of samples for downstream analysis.

pcawg_clusters <- get_classes(cluster_res, k = 5)
saveRDS(pcawg_clusters, file = "../data/pcawg_clusters.rds")

Heatmap for clustering and genotype/phenotype features

Load tidy sample info.

tidy_info <- readRDS("../data/pcawg_sample_tidy_info.rds")

Clean some info.

detect_any <- function(x, p) {
  if (length(p) == 1L) {
    y <- stringr::str_detect(x, p)
  } else {
    y <- purrr::map(p, ~ stringr::str_detect(x, .))
    y <- purrr::reduce(y, `|`)
  }
  y[is.na(y)] <- FALSE
  y
}

tidy_info <- tidy_info %>%
  dplyr::mutate(
    # Roughly reassign the staging to TNM stages
    # Basically follow 7th TNM staging version, see plot <https://bkimg.cdn.bcebos.com/pic/a8014c086e061d952019ec8773f40ad162d9ca36?x-bce-process=image/watermark,image_d2F0ZXIvYmFpa2U4MA==,g_7,xp_5,yp_5>
    #
    # Other references:
    # https://www.cancer.org/treatment/understanding-your-diagnosis/staging.html
    # https://www.cancer.gov/publications/dictionaries/cancer-terms/def/abcd-rating
    # https://web.archive.org/web/20081004121942/http://www.oncologychannel.com/prostatecancer/stagingsystems.shtml
    # https://baike.baidu.com/item/TNM%E5%88%86%E6%9C%9F%E7%B3%BB%E7%BB%9F/10700513
    tumour_stage = case_when(
      tumour_stage %in% c("1", "1a", "1b", "A", "I", "IA", "IB", "T1c", "T1N0") | detect_any(tumour_stage, c("T1N0M0", "T1aN0M0", "T1bN0M0", "T2aN0M0")) ~ "I",
      tumour_stage %in% c("2", "2a", "2b", "B", "II", "IIA", "IIB", "T3a", "T3aN0") | detect_any(tumour_stage, c("T2bN0M0", "T3N0M0", "T[^34]N1.*M0")) ~ "II",
      tumour_stage %in% c("3", "3a", "3b", "3c", "C", "III", "IIIA", "IIIB", "IIIC") | detect_any(tumour_stage, c("N[23].*M0", "T3N1.*M0", "T4.*M0")) ~ "III",
      tumour_stage %in% c("4", "IV", "IVA") | detect_any(tumour_stage, "M[^0X]") ~ "IV",
      TRUE ~ "Unknown"
    ),
    first_therapy_type = ifelse(first_therapy_type == "monoclonal antibodies (for liquid tumours)",
      "other therapy", first_therapy_type
    )
  )

Generate annotation data for plotting.

anno_df <- tidy_info %>%
  dplyr::filter(keep) %>%
  dplyr::select(
    sample,
    donor_age_at_diagnosis, n_of_amp, n_of_del, n_of_vchr, cna_burden, purity, ploidy,
    n_LOH, n_fusion, n_chromo, n_amplicon_BFB, n_amplicon_ecDNA, n_amplicon_HR, n_amplicon_Linear,
    tel_content_log2,
    donor_sex, tumour_stage, wgd_status, hrd_status
  ) %>%
  dplyr::mutate_at(
    vars(n_fusion, n_chromo, n_amplicon_BFB, n_amplicon_ecDNA, n_amplicon_HR, n_amplicon_Linear),
    ~ ifelse(is.na(.), 0, .)
  ) %>%
  dplyr::mutate(hrd_status = ifelse(hrd_status, "hrd", "no_hrd")) %>%
  rename(
    Age = donor_age_at_diagnosis,
    AMPs = n_of_amp,
    DELs = n_of_del,
    Affected_Chrs = n_of_vchr,
    CNA_Burden = cna_burden,
    Purity = purity,
    Ploidy = ploidy,
    LOHs = n_LOH,
    Fusions = n_fusion,
    Chromothripsis = n_chromo,
    BFBs = n_amplicon_BFB,
    ecDNAs = n_amplicon_ecDNA,
    HRs = n_amplicon_HR,
    Linear_Amplicons = n_amplicon_Linear,
    `Telomere_Content(log2)` = tel_content_log2,
    Sex = donor_sex,
    Tumor_Stage = tumour_stage,
    WGD_Status = wgd_status,
    HRD_Status = hrd_status
  ) %>%
  tibble::column_to_rownames("sample")

saveRDS(anno_df, file = "../data/pcawg_tidy_anno_df.rds")

left_annotation <- rowAnnotation(foo = anno_text(paste0("CNS", 1:13)))

Show heatmap for clusters based on signature activities.

get_signatures(cluster_res,
  k = 5, silhouette_cutoff = 0.2, anno = anno_df,
  left_annotation = left_annotation
)
* 2674/2737 samples (in 5 classes) remain after filtering by silhouette (>= 0.2).
* cache hash: 4b9a944dfe09dbf61fff67d0357a67aa (seed 888).
* calculating row difference between subgroups by Ftest.
* split rows into 4 groups by k-means clustering.
* 13 signatures (100.0%) under fdr < 0.05, group_diff > 0.
* making heatmaps for signatures.

Save to file.

pdf(file = "../output/pcawg_cluster_heatmap.pdf", width = 22, height = 10, onefile = FALSE)
get_signatures(cluster_res, k = 5, silhouette_cutoff = 0.2, anno = anno_df, left_annotation = left_annotation)
dev.off()

Exploration of patients’ prognosis difference across clusters

Clean data for survival analysis

library(ezcox)
library(tidyverse)

cluster_df <- readRDS("../data/pcawg_clusters.rds") %>%
  as.data.frame() %>%
  tibble::rownames_to_column("sample")

tidy_info <- readRDS("../data/pcawg_sample_tidy_info.rds")

df_os <- tidy_info %>%
  dplyr::filter(keep) %>%
  dplyr::select(sample, donor_vital_status, donor_survival_time) %>%
  purrr::set_names(c("sample", "os", "time")) %>%
  na.omit() %>%
  mutate(
    os = ifelse(os == "deceased", 1, 0),
    time = time / 365
  ) %>%
  left_join(cluster_df, by = "sample")

colnames(df_os)[4] <- "cluster"
# Use the group with minimal CNV level as reference group
df_os$cluster <- paste0("subgroup", df_os$cluster)

Available sample number with OS data:

nrow(df_os)
[1] 1729

Forest plot

It’s pretty easy to visualize the relationship between subgroups and patient’s prognosis with forest plot by R package ezcox developed by me before.

show_forest(df_os, covariates = "cluster", status = "os", add_caption = FALSE, merge_models = TRUE)

K-M plot

We can also check the OS difference with K-M plot.

library(survival)
library(survminer)
sfit <- survfit(Surv(time, os) ~ cluster, data = df_os)
ggsurvplot(sfit,
  pval = TRUE,
  fun = "pct",
  xlab = "Time (in years)",
  # palette = "jco",
  legend.title = ggplot2::element_blank(),
  legend.labs = paste0("subgroup", 1:5),
  break.time.by = 5,
  risk.table = TRUE,
  tables.height = 0.4
)

The result has similar meaning as forest plot.

The difference of genotype/phenotype measures across clusters

Signatures

Load SBS/DBS/ID signature activities generated from PCAWG Nature Study.

pcawg_sbs <- read_csv("../data/PCAWG/PCAWG_sigProfiler_SBS_signatures_in_samples.csv")
pcawg_dbs <- read_csv("../data/PCAWG/PCAWG_sigProfiler_DBS_signatures_in_samples.csv")
pcawg_id <- read_csv("../data/PCAWG/PCAWG_SigProfiler_ID_signatures_in_samples.csv")

Merge data.

df_merged <- purrr::reduce(
  list(
    tidy_info,
    pcawg_sbs[, -c(1, 3)] %>% dplyr::rename(sample = `Sample Names`) %>%
      # Drop Possible sequencing artefact associated signatures
      dplyr::select(-SBS43, -c(SBS45:SBS60)),
    pcawg_dbs[, -c(1, 3)] %>% dplyr::rename(sample = `Sample Names`),
    pcawg_id[, -c(1, 3)] %>% dplyr::rename(sample = `Sample Names`),
    cluster_df[, 1:2] %>% purrr::set_names(c("sample", "cluster"))
  ),
  dplyr::left_join,
  by = "sample"
) %>%
  dplyr::filter(!is.na(cluster)) %>%
  mutate(cluster = paste0("subgroup", cluster))

colnames(df_merged) <- gsub("Abs_", "", colnames(df_merged))

Enrichment analysis for copy number signatures

library(sigminer)
enrich_result_cn <- group_enrichment(
  df_merged,
  grp_vars = "cluster",
  enrich_vars = paste0("CNS", 1:14),
  co_method = "wilcox.test"
)
enrich_result_cn$enrich_var <- factor(enrich_result_cn$enrich_var, paste0("CNS", 1:14))

p <- show_group_enrichment(
  enrich_result_cn,
  fill_by_p_value = TRUE,
  cut_p_value = TRUE,
  return_list = T
)
p <- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip()

Enrichment analysis for SBS signatures

nm_sbs <- colnames(df_merged)[grepl("SBS", colnames(df_merged))]

enrich_result_sbs <- group_enrichment(
  df_merged,
  grp_vars = "cluster",
  enrich_vars = nm_sbs,
  co_method = "wilcox.test"
)
enrich_result_sbs$enrich_var <- factor(enrich_result_sbs$enrich_var, nm_sbs)

p <- show_group_enrichment(
  enrich_result_sbs,
  fill_by_p_value = TRUE,
  cut_p_value = TRUE,
  return_list = T
)
p <- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip()

Enrichment analysis for DBS signatures

nm_dbs <- colnames(df_merged)[grepl("DBS", colnames(df_merged))]

enrich_result_dbs <- group_enrichment(
  df_merged,
  grp_vars = "cluster",
  enrich_vars = nm_dbs,
  co_method = "wilcox.test"
)
enrich_result_dbs$enrich_var <- factor(enrich_result_dbs$enrich_var, nm_dbs)

p <- show_group_enrichment(
  enrich_result_dbs,
  fill_by_p_value = TRUE,
  cut_p_value = TRUE,
  return_list = T
)
p <- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip()

Enrichment analysis for ID signatures

nm_id <- colnames(df_merged)[grepl("^ID[0-9]+", colnames(df_merged))]

enrich_result_id <- group_enrichment(
  df_merged,
  grp_vars = "cluster",
  enrich_vars = nm_id,
  co_method = "wilcox.test"
)
enrich_result_id$enrich_var <- factor(enrich_result_id$enrich_var, nm_id)

p <- show_group_enrichment(
  enrich_result_id,
  fill_by_p_value = TRUE,
  cut_p_value = TRUE,
  return_list = T
)
p <- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip()

Combined COSMIC signature result

Merge all COSMIC signature results.

enrich_result_cosmic <- purrr::reduce(
  list(enrich_result_sbs, enrich_result_dbs, enrich_result_id),
  rbind
)

移除带 NA 和 Inf 的 signatures 结果,然后绘制一个汇总图,对 signatures 进行聚类,以确定 signature 排序。

Word cloud plot for COSMIC signature etiologies

There are so many COSMIC signatures above, it is not easy to summarize the data. Considering all COSMIC signatures are labeled and many of them have been assigned to a specific etiology, here we try to use word cloud plot to summarize the result above.

Load etiologies of COSMIC signatures.

cosmic_ets <- purrr::reduce(
  list(
    sigminer::get_sig_db("SBS")$aetiology %>% tibble::rownames_to_column("sig_name"),
    sigminer::get_sig_db("DBS")$aetiology %>% tibble::rownames_to_column("sig_name"),
    sigminer::get_sig_db("ID")$aetiology %>% tibble::rownames_to_column("sig_name")
  ),
  rbind
)
DT::datatable(cosmic_ets)

Generate a data.frame for plotting. We only keep significantly and positively enriched signatures in each subgroup. We note that descriptions for many signatures are too long, thus we use short names.

df_ets <- enrich_result_cosmic %>%
  dplyr::filter(p_value < 0.05 & measure_observed > 1) %>%
  dplyr::select(grp1, enrich_var) %>%
  dplyr::left_join(cosmic_ets, by = c("enrich_var" = "sig_name")) %>%
  dplyr::mutate(
    aetiology = dplyr::case_when(
      grepl("APOBEC", aetiology) ~ "APOBEC",
      grepl("Tobacco", aetiology) ~ "Tobacco",
      grepl("clock", aetiology) ~ "Clock",
      grepl("Ultraviolet", aetiology, ignore.case = TRUE) ~ "UV",
      grepl("homologous recombination", aetiology) ~ "HRD",
      grepl("mismatch repair", aetiology) ~ "dMMR",
      grepl("Slippage", aetiology) ~ "Slippage",
      grepl("base excision repair", aetiology) ~ "dBER",
      grepl("NHEJ", aetiology) ~ "NHEJ repair",
      grepl("reactive oxygen", aetiology) ~ "ROS",
      grepl("Polymerase epsilon exonuclease", aetiology) ~ "Polymerase epsilon mutation",
      grepl("eta somatic hypermutation", aetiology) ~ "Polimerase eta hypermutation",
      TRUE ~ aetiology
    )
  ) %>%
  dplyr::count(grp1, aetiology) %>%
  dplyr::filter(!aetiology %in% c("Unknown", "Possible sequencing artefact"))

Plotting.

library(ggwordcloud)

set.seed(42)
ggplot(df_ets, aes(label = aetiology, size = n)) +
  geom_text_wordcloud_area(eccentricity = .35, shape = "square") +
  scale_size_area(max_size = 14) +
  facet_wrap(~grp1) +
  theme_bw()

Other integrated variables

df_others <- readRDS("../data/pcawg_tidy_anno_df.rds") %>%
  tibble::rownames_to_column("sample") %>%
  dplyr::left_join(
    cluster_df[, 1:2] %>% purrr::set_names(c("sample", "cluster")),
    by = "sample"
  ) %>%
  dplyr::mutate(
    Tumor_Stage = dplyr::case_when(
      Tumor_Stage == "I" ~ 1,
      Tumor_Stage == "II" ~ 2,
      Tumor_Stage == "III" ~ 3,
      Tumor_Stage == "IV" ~ 4
    ),
    isFemale = Sex == "female",
    isWGD = WGD_Status == "wgd",
    isHRD = HRD_Status == "hrd",
    cluster = paste0("subgroup", cluster)
  ) %>%
  dplyr::select(-c(Sex, WGD_Status, HRD_Status))
nm_others <- setdiff(colnames(df_others), c("sample", "cluster"))

enrich_result_others <- group_enrichment(
  df_others,
  grp_vars = "cluster",
  enrich_vars = nm_others,
  co_method = "wilcox.test"
)
enrich_result_others$enrich_var <- factor(enrich_result_others$enrich_var, nm_others)

p <- show_group_enrichment(
  enrich_result_others,
  fill_by_p_value = TRUE,
  cut_p_value = TRUE,
  return_list = T
)
p <- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip()

The HR here represents Heavily rearrangement, don’t mix it with HRD which means Defective homologous recombination DNA damage repair.

Driver genes

We have already observed that there are different mutational processes across different groups. Here we try to go further study if different driver genes activate in different groups.

We obtained coding driver mutations from UCSC Xena database. For known driver genes, we focus on driver gene list obtained from Martínez-Jiménez et al. (2020).

load("../data/pcawg_driver_info.RData")
df_mut <- dplyr::left_join(df_others[, c("sample", "cluster")],
  pcawg_driver_number,
  by = "sample"
)
colnames(df_mut)[3] <- "Driver_variants"
df_mut$Driver_variants <- ifelse(is.na(df_mut$Driver_variants), 0, df_mut$Driver_variants)

driver_gene_num <- pcawg_driver_gene %>%
  dplyr::group_by(sample, ROLE) %>%
  dplyr::summarise(driver_genes = n(), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = "ROLE", values_from = "driver_genes", values_fill = 0)

df_mut <- dplyr::left_join(df_mut, driver_gene_num, by = "sample")
colnames(df_mut)[4:5] <- c("Act_genes", "LoF_genes")
df_mut$Act_genes <- ifelse(is.na(df_mut$Act_genes), 0, df_mut$Act_genes)
df_mut$LoF_genes <- ifelse(is.na(df_mut$LoF_genes), 0, df_mut$LoF_genes)

Also merge into info of each driver gene.

df_mut2 <- df_mut %>%
  dplyr::left_join(
    pcawg_driver_gene %>%
      tidyr::pivot_wider(names_from = "gene", values_from = "ROLE") %>%
      dplyr::mutate_at(vars(-sample), ~ !is.na(.)),
    by = "sample"
  )

## Filter out driver genes with < 10 mutation
gene_sum <- sort(colSums(df_mut2[, -c(1:5)], na.rm = TRUE), decreasing = TRUE)


# 基因按突变数量排序

df_mut2 <- df_mut2 %>% dplyr::select_at(c(colnames(df_mut2)[1:5], names(gene_sum[gene_sum >= 10])))

Save the data.

saveRDS(df_mut2, file = "../data/pcawg_mut_df.rds")
nm_mut <- setdiff(colnames(df_mut2), c("sample", "cluster"))

enrich_result_mut <- group_enrichment(
  df_mut2,
  grp_vars = "cluster",
  enrich_vars = nm_mut,
  co_method = "wilcox.test"
)

Filter out genes insignificant in all subgroups.

keep_df <- enrich_result_mut[, .(count = sum(p_value < 0.05)), by = enrich_var]
drop_vars <- keep_df[count == 0]$enrich_var

enrich_result_mut2 <- enrich_result_mut[!enrich_var %in% drop_vars]

Plotting.

enrich_result_mut2$enrich_var <- factor(enrich_result_mut2$enrich_var, setdiff(nm_mut, drop_vars))

keep_vars <- setdiff(nm_mut, drop_vars)[-c(1:3)]
labels <- c(nm_mut[1:3], paste0(keep_vars, " (", gene_sum[keep_vars], ")"))

p <- show_group_enrichment(
  enrich_result_mut2,
  fill_by_p_value = TRUE,
  cut_p_value = TRUE,
  return_list = T
)
p <- p$cluster + labs(x = NULL, y = NULL)
p + coord_flip() + scale_x_discrete(labels = labels)

rm(list = ls())
Martínez-Jiménez, Francisco, Ferran Muiños, Inés Sentís, Jordi Deu-Pons, Iker Reyes-Salazar, Claudia Arnedo-Pac, Loris Mularoni, et al. 2020. “A Compendium of Mutational Cancer Driver Genes.” Nature Reviews Cancer 20 (10): 555–72. https://doi.org/10.1038/s41568-020-0290-x.
PCAWG Mutational Signatures Working Group, PCAWG Consortium, Ludmil B. Alexandrov, Jaegil Kim, Nicholas J. Haradhvala, Mi Ni Huang, Alvin Wei Tian Ng, et al. 2020. “The Repertoire of Mutational Signatures in Human Cancer.” Nature 578 (7793): 94–101. https://doi.org/10.1038/s41586-020-1943-3.